In [None]:
import pytz
import datetime
import marimo as mo

# Timestamp the notebook in the Asia/Kolkata zone so "Last Updated" is shown
# in the same timezone as the submission deadline below.
india_timezone = pytz.timezone("Asia/Kolkata")
now = datetime.datetime.now(tz=india_timezone)

# Layout: date, 12-hour clock time with AM/PM, then the timezone abbreviation.
curr = now.strftime("%Y-%m-%d, %I:%M:%S %p %Z")

# Render the title banner; as the last expression it is the cell's output.
mo.md(rf"""
# Week - 7

**Submission Date:** `2025-11-12, 23:59 IST`

**Last Updated:** `{curr}`
""")

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import (
    fetch_california_housing,
    load_iris,
    fetch_20newsgroups,
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import (
    r2_score,
    root_mean_squared_error,
    confusion_matrix,
    classification_report,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## Question 1

Load the `fetch_california_housing` dataset from sklearn. Split the data using train_test_split with `test_size = 0.2` and **random_state as 1**. Train the following models using the training data and calculate the **R2 Score** and **Root Mean Squared Error (RMSE)** on the test data.

- Model 1: Linear Regression
- Model 2: AdaBoost Regressor with `random_state = 1`
- Model 3: Decision Tree Regressor with `random_state = 1`

Choose the appropriate option:

In [None]:
# Load the California housing features and target as pandas objects.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

In [None]:
# The three regressors compared in Question 1. The two seeded estimators use
# random_state=1 as required by the question.
lr, ada, dtr = (
    LinearRegression(),
    AdaBoostRegressor(random_state=1),
    DecisionTreeRegressor(random_state=1),
)

In [None]:
# Fit every regressor on the same training split, in the original order.
lr.fit(X=X_train, y=y_train)
ada.fit(X=X_train, y=y_train)
# Last expression: the fitted decision tree is what the cell displays.
dtr.fit(X=X_train, y=y_train)

In [None]:
def score(model, scoring="r2", X=None, y=None):
    """Evaluate a fitted regressor on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    scoring : {"r2", "rmse"}, default "r2"
        Metric to compute: ``r2_score`` or ``root_mean_squared_error``.
    X, y : optional
        Features and true targets to evaluate on. When omitted, the
        notebook-level ``X_test`` / ``y_test`` split is used, which is the
        behaviour of the original two-argument form.

    Returns
    -------
    The value returned by the selected metric for ``model``'s predictions.

    Raises
    ------
    ValueError
        If ``scoring`` is not one of the supported names. Previously any
        unrecognised value (e.g. the typo ``"rsme"``) silently returned R2.
    """
    # Validate first so a typo fails loudly instead of reporting the wrong metric.
    if scoring not in ("r2", "rmse"):
        raise ValueError(
            f"Unknown scoring {scoring!r}; expected 'r2' or 'rmse'."
        )

    metric = root_mean_squared_error if scoring == "rmse" else r2_score

    # Resolve the defaults at call time so the current test split is used.
    features = X_test if X is None else X
    targets = y_test if y is None else y

    return metric(targets, model.predict(features))

In [None]:
# R2 on the test split for (linear regression, AdaBoost, decision tree).
tuple(score(fitted) for fitted in (lr, ada, dtr))

In [None]:
# RMSE on the test split for (linear regression, AdaBoost, decision tree).
tuple(score(fitted, scoring="rmse") for fitted in (lr, ada, dtr))

## Question 2 - 3

Load the Iris dataset from sklearn. Split the data using `train_test_split` with `test_size = 0.33` and `random_state = 1`. Train a Logistic Regression model (with `random_state = 1`) on the training data and make predictions on the test data. Print the confusion matrix and the classification report.

In [None]:
# Load the Iris features and class labels as pandas objects.
X2, y2 = load_iris(return_X_y=True, as_frame=True)

In [None]:
# Reserve 33% of the Iris rows for testing, with a fixed seed.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    random_state=1,
    test_size=0.33,
)

In [None]:
# Logistic regression seeded as the question requires; all other
# hyper-parameters are left at the library defaults.
lgr = LogisticRegression(random_state=1)
# Last expression: the fitted classifier is what the cell displays.
lgr.fit(X=X_train2, y=y_train2)

In [None]:
# Predictions on the held-out Iris rows, reused by both reports below.
_y_pred = lgr.predict(X_test2)

# Use sep="\n" rather than embedding "\n" in the label: with the default
# separator print() emits a space after the newline, which shifts the first
# matrix row and the report header one column out of alignment.
print(
    "Confusion Matrix:",
    confusion_matrix(y_test2, _y_pred),
    sep="\n",
    end="\n\n\n",  # keep the original blank lines between the two reports
)
print(
    "Classification Report:",
    classification_report(y_test2, _y_pred),
    sep="\n",
)

### Question 2

How many samples has the model misclassified?

In [None]:
# Count the test rows whose predicted class differs from the true label.
y_test2.ne(lgr.predict(X_test2)).sum()

### Question 3

What is the recall for class 1? [Answer to two decimal places] (Note that the classes are 0, 1, 2)

0.95 (from the classification report above)

## Question 4 - 5

Load the **train subset** of '20newsgroups' data with `return_X_y = True`. Vectorize X using TfidfVectorizer.

In [None]:
# Fetch only the training subset of 20 Newsgroups as (documents, labels).
X3, y3 = fetch_20newsgroups(return_X_y=True, subset="train")

In [None]:
# Preview a few documents, truncated to 200 characters, with their labels.
# Displaying 100 complete posts floods the output and bloats the saved notebook.
[doc[:200] for doc in X3[:3]], y3[:3]

In [None]:
# Learn the vocabulary and IDF weights from the training documents, then
# transform those same documents into a TF-IDF matrix.
vec = TfidfVectorizer()
X3_tfidf = vec.fit_transform(raw_documents=X3)

In [None]:
# Summarise the TF-IDF matrix. nnz / (rows * cols) is the fraction of stored
# entries, i.e. the density. The original printed that value under the label
# "Sparsity"; sparsity is the complement (1 - density).
_n_rows, _n_cols = X3_tfidf.shape
_density = X3_tfidf.nnz / (_n_rows * _n_cols)

print("Shape:", X3_tfidf.shape)
print("Number of stored elements (non-zero):", X3_tfidf.nnz)
print("Density: {:.2f}%".format(100 * _density))
print("Sparsity: {:.2f}%".format(100 * (1 - _density)))

### Question 4

Which of the following options represents the shape of the fitted and transformed data X?

In [None]:
# Answer to Question 4: shape of the fitted-and-transformed TF-IDF matrix.
X3_tfidf.shape

### Question 5

Split the vectorized data and label into train and validation sets using `train_test_split` with `test_size = 0.3` and `random_state = 1`. Train a MultinomialNB model on the training dataset and compute the score on the validation set. Enter the score obtained.

In [None]:
# Split the vectorised documents 70/30 into train and validation sets, with a
# fixed seed.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3_tfidf,
    y3,
    random_state=1,
    test_size=0.3,
)

In [None]:
# Multinomial naive Bayes with default settings, trained on the TF-IDF features.
nmb = MultinomialNB()
# Last expression: the fitted classifier is what the cell displays.
nmb.fit(X=X_train3, y=y_train3)

In [None]:
# Answer to Question 5: the classifier's score on the validation split.
nmb.score(X=X_test3, y=y_test3)